-----------------------

Data Exploration & Visualization.

-----------------------

In [1]:
# Category distribution
from pathlib import Path

import os
import seaborn as sns
import matplotlib.pyplot as plt


%matplotlib inline

FOLDER_TRN = Path('train/')

fnames = os.listdir(FOLDER_TRN)
labels = []
for fname in fnames:
    labels.append(fname.split('.')[0])

sns.countplot(labels)
plt.title('Cats vs Dogs');
In [11]:
# Check mean aspect ratio (width/height), mean width and mean height
from keras.preprocessing import image

import numpy as np


imgs_width = [image.load_img(FOLDER_TRN / name).width for name in fnames]
imgs_height = [image.load_img(FOLDER_TRN / name).height for name in fnames]
aspect_ratio = np.array(imgs_width).astype(np.float64) / np.array(imgs_height).astype(np.float64)
In [65]:
# Scatter diagram of images width&height
plt.scatter(imgs_width, imgs_height, c='y', s=15);
plt.xlabel('width (mean:{:.3f})'.format(np.mean(imgs_width)))
plt.ylabel('height (mean:{:.3f})'.format(np.mean(imgs_height)));
In [67]:
# Statistical feature for image width&height 
import pandas as pd


df_img_size = pd.DataFrame({
    'width': imgs_width,
    'height': imgs_height
})
df_img_size.describe()
Out[67]:
width height
count 25000.00000 25000.000000
mean 404.09904 360.478080
std 109.03793 97.019959
min 42.00000 32.000000
25% 323.00000 301.000000
50% 447.00000 374.000000
75% 499.00000 421.000000
max 1050.00000 768.000000
In [39]:
# Plotting aspect ratio
plt.plot(aspect_ratio, c='r')
plt.title('Ratio: Width / Height (mean: {:.3f})'.format(aspect_ratio.mean()));
In [87]:
# Aspect ratio vary so much is not good, let's record these files for the moment
from pprint import pprint


# upper&lower quartiles
quartile_ratio = np.percentile(aspect_ratio, [25, 75])
print('Quartile for aspect ratio, upper: {:.3f}; lower: {:.3f}'.format(quartile_ratio[0], quartile_ratio[1]))
vary = [fnames[i] for i, ratio in enumerate(aspect_ratio) if ratio < quartile_ratio[0] / 2 or ratio > quartile_ratio[1] * 2]
print('There are {} files have unusual aspect ratio:\n'.format(len(vary)))
pprint(vary)
Quartile for aspect ratio, upper: 0.930; lower: 1.337
There are 58 files have unusual aspect ratio:

['cat.10119.jpg',
 'dog.4367.jpg',
 'cat.3324.jpg',
 'cat.8902.jpg',
 'cat.9954.jpg',
 'cat.3567.jpg',
 'dog.3330.jpg',
 'cat.5351.jpg',
 'dog.1741.jpg',
 'dog.4113.jpg',
 'dog.10749.jpg',
 'cat.712.jpg',
 'cat.8542.jpg',
 'cat.11349.jpg',
 'dog.3380.jpg',
 'cat.10192.jpg',
 'cat.1723.jpg',
 'cat.11060.jpg',
 'cat.11255.jpg',
 'dog.2478.jpg',
 'dog.9632.jpg',
 'cat.5773.jpg',
 'cat.5929.jpg',
 'dog.7294.jpg',
 'dog.1483.jpg',
 'cat.728.jpg',
 'cat.188.jpg',
 'dog.3135.jpg',
 'cat.12420.jpg',
 'cat.12243.jpg',
 'cat.9552.jpg',
 'dog.3139.jpg',
 'cat.9171.jpg',
 'cat.8755.jpg',
 'dog.2503.jpg',
 'dog.11526.jpg',
 'dog.12331.jpg',
 'cat.6232.jpg',
 'dog.11248.jpg',
 'cat.3370.jpg',
 'dog.8739.jpg',
 'cat.5964.jpg',
 'dog.7857.jpg',
 'cat.9819.jpg',
 'dog.5880.jpg',
 'cat.10988.jpg',
 'dog.10199.jpg',
 'cat.664.jpg',
 'cat.11512.jpg',
 'dog.7019.jpg',
 'dog.1985.jpg',
 'dog.2874.jpg',
 'cat.11149.jpg',
 'dog.8142.jpg',
 'dog.4712.jpg',
 'dog.3863.jpg',
 'cat.5981.jpg',
 'cat.7622.jpg']
In [88]:
# Lets take a look for part of these images with unusual aspect ratio
# We can see some are horizontally stretched, others are vertically stretched
nrows, ncols = (5, 6)
fig, ax = plt.subplots(nrows, ncols, figsize=(25, 25))
plt.subplots_adjust(hspace=.9)

for num in range(nrows*ncols):
    file_name = vary[num]
    file_path = FOLDER_TRN / file_name
    label = file_name.split('.')[0]
    img = plt.imread(file_path)
    
    ax[num // ncols, num % ncols].imshow(img)
    ax[num // ncols, num % ncols].set_title("file: {}\n label: {}".format(file_name, label), size=12)
    ax[num // ncols, num % ncols].get_xaxis().set_visible(False)
    ax[num // ncols, num % ncols].get_yaxis().set_visible(False)

-----------------------

Dogs & Cats Category Codes In ImageNet.

-----------------------

In [2]:
dogs_code = [
    'n02085620','n02085782','n02085936','n02086079',
    'n02086240','n02086646','n02086910','n02087046',
    'n02087394','n02088094','n02088238','n02088364',
    'n02088466','n02088632','n02089078','n02089867',
    'n02089973','n02090379','n02090622','n02090721',
    'n02091032','n02091134','n02091244','n02091467',
    'n02091635','n02091831','n02092002','n02092339',
    'n02093256','n02093428','n02093647','n02093754',
    'n02093859','n02093991','n02094114','n02094258',
    'n02094433','n02095314','n02095570','n02095889',
    'n02096051','n02096177','n02096294','n02096437',
    'n02096585','n02097047','n02097130','n02097209',
    'n02097298','n02097474','n02097658','n02098105',
    'n02098286','n02098413','n02099267','n02099429',
    'n02099601','n02099712','n02099849','n02100236',
    'n02100583','n02100735','n02100877','n02101006',
    'n02101388','n02101556','n02102040','n02102177',
    'n02102318','n02102480','n02102973','n02104029',
    'n02104365','n02105056','n02105162','n02105251',
    'n02105412','n02105505','n02105641','n02105855',
    'n02106030','n02106166','n02106382','n02106550',
    'n02106662','n02107142','n02107312','n02107574',
    'n02107683','n02107908','n02108000','n02108089',
    'n02108422','n02108551','n02108915','n02109047',
    'n02109525','n02109961','n02110063','n02110185',
    'n02110341','n02110627','n02110806','n02110958',
    'n02111129','n02111277','n02111500','n02111889',
    'n02112018','n02112137','n02112350','n02112706',
    'n02113023','n02113186','n02113624','n02113712',
    'n02113799','n02113978'
]

cats_code = [
    'n02123045','n02123159','n02123394','n02123597',
    'n02124075','n02125311','n02127052'
]

-----------------------------

GPU Resources

-----------------------------

In [3]:
# checkout GPU
!nvidia-smi
Thu Aug 29 01:40:13 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.40.04    Driver Version: 418.40.04    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   45C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

-----------------------------

Data Cleaning

-----------------------------

In [89]:
# Using pretrained models to find out abnormal images
from keras.applications import Xception, xception, InceptionV3, inception_v3, ResNet50, resnet50, InceptionResNetV2, inception_resnet_v2, \
    NASNetLarge, nasnet

import tensorflow as tf
import keras.backend.tensorflow_backend as KTF


KTF.set_session(tf.Session(config=tf.ConfigProto(device_count={'gpu':0})))

CF_FOLDER = Path('cheatfiles/')


def find_out_cheat_files(model, preprocess_func, decode, img_size):
    """
        use pretrained model(on imagenet) to find out images that may confuse my model in later training step.
    """
    
    cheat_files = []
    for fname in fnames:
        category = cats_code if fname.split('.')[0] == 'cat' else dogs_code
        img_path = FOLDER_TRN / fname
        img = image.load_img(img_path, target_size=img_size)
        arr = image.img_to_array(img)
        tensor = np.expand_dims(arr, axis=0)
        tensor = preprocess_func(tensor)
        preds = model.predict(tensor)
        preds_decode, _, __ = zip(*decode(preds, top=30)[0])

        if not np.intersect1d(preds_decode, category).size:
            cheat_files.append(fname)
    
    return cheat_files
In [10]:
# Found out by Xception
cheatfiles_xception = find_out_cheat_files(
    model=Xception(weights='imagenet'),
    preprocess_func=xception.preprocess_input,
    decode=xception.decode_predictions,
    img_size=(299, 299)
)
In [14]:
# Record the identification by Xception
np.savez(CF_FOLDER / 'xception_top30.npz', xception=cheatfiles_xception)
In [5]:
cheatfiles_xception = np.load(CF_FOLDER / 'xception_top30.npz')['xception']
In [6]:
len(cheatfiles_xception)
Out[6]:
103
In [16]:
# Found out by inception_v3
cheatfiles_inception_v3 = find_out_cheat_files(
    model=InceptionV3(weights='imagenet'), 
    preprocess_func=inception_v3.preprocess_input, 
    decode=inception_v3.decode_predictions,
    img_size=(299, 299)
)
In [18]:
# Record the identification by InceptionV3
np.savez(CF_FOLDER / 'inception_v3_top30.npz', inception_v3=cheatfiles_inception_v3)
In [6]:
cheatfiles_inception_v3 = np.load(CF_FOLDER / 'inception_v3_top30.npz')['inception_v3']
In [8]:
len(cheatfiles_inception_v3)
Out[8]:
147
In [19]:
# Found out by resnet50
cheatfiles_resnet50 = find_out_cheat_files(
    model=ResNet50(weights='imagenet'), 
    preprocess_func=resnet50.preprocess_input, 
    decode=resnet50.decode_predictions, 
    img_size=(224, 224)
)
In [21]:
# Record the identification by ResNet50
np.savez(CF_FOLDER / 'resnet50_top30.npz', resnet50=cheatfiles_resnet50)
In [7]:
cheatfiles_resnet50 = np.load(CF_FOLDER / 'resnet50_top30.npz')['resnet50']
In [10]:
len(cheatfiles_resnet50)
Out[10]:
325
In [8]:
# Found out by NASNetMobile
cheatfiles_nasnet = find_out_cheat_files(
    model=NASNetLarge(weights='imagenet'), 
    preprocess_func=nasnet.preprocess_input, 
    decode=nasnet.decode_predictions, 
    img_size=(331, 331)
)
WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
In [9]:
# Record the identification by NASNetMobile
np.savez(CF_FOLDER / 'nasnet_top30.npz', nasnet=cheatfiles_nasnet)
In [8]:
cheatfiles_nasnet = np.load(CF_FOLDER / 'nasnet_top30.npz')['nasnet']
In [11]:
len(cheatfiles_nasnet)
Out[11]:
96
In [6]:
# Found out by InceptionResNetV2
cheatfiles_inception_resnet_v2 = find_out_cheat_files(
    model=InceptionResNetV2(weights='imagenet'), 
    preprocess_func=inception_resnet_v2.preprocess_input, 
    decode=inception_resnet_v2.decode_predictions, 
    img_size=(299, 299)
)
In [17]:
# Record the identification by InceptionResNet50V2
np.savez(CF_FOLDER / 'inception_resnet_v2_top30.npz', inception_resnet_v2=cheatfiles_inception_resnet_v2)
In [9]:
cheatfiles_inception_resnet_v2 = np.load(CF_FOLDER / 'inception_resnet_v2_top30.npz')['inception_resnet_v2']
In [16]:
len(cheatfiles_inception_resnet_v2)
Out[16]:
124
In [10]:
# Aggregate all the files that found out by above pretrained models
from functools import reduce


cheat_files_by_models = reduce(
    np.union1d, 
    (cheatfiles_xception, cheatfiles_inception_v3, cheatfiles_inception_resnet_v2, cheatfiles_resnet50, cheatfiles_nasnet)
)

print("models found out {} cheat files below:".format(len(cheat_files_by_models)))
pprint(cheat_files_by_models)
models found out 416 cheat files below:
array(['cat.10029.jpg', 'cat.10037.jpg', 'cat.10107.jpg', 'cat.10121.jpg',
       'cat.10209.jpg', 'cat.10220.jpg', 'cat.10266.jpg', 'cat.10270.jpg',
       'cat.10365.jpg', 'cat.10425.jpg', 'cat.10471.jpg', 'cat.10521.jpg',
       'cat.10532.jpg', 'cat.10536.jpg', 'cat.10539.jpg', 'cat.10579.jpg',
       'cat.10609.jpg', 'cat.10634.jpg', 'cat.10636.jpg', 'cat.10700.jpg',
       'cat.10712.jpg', 'cat.10743.jpg', 'cat.10807.jpg', 'cat.10863.jpg',
       'cat.10864.jpg', 'cat.10893.jpg', 'cat.10912.jpg', 'cat.10932.jpg',
       'cat.10946.jpg', 'cat.11018.jpg', 'cat.11039.jpg', 'cat.11062.jpg',
       'cat.11141.jpg', 'cat.11168.jpg', 'cat.11184.jpg', 'cat.11222.jpg',
       'cat.11231.jpg', 'cat.11255.jpg', 'cat.11256.jpg', 'cat.11281.jpg',
       'cat.11297.jpg', 'cat.1139.jpg', 'cat.11399.jpg', 'cat.114.jpg',
       'cat.11432.jpg', 'cat.11544.jpg', 'cat.11562.jpg', 'cat.11565.jpg',
       'cat.11607.jpg', 'cat.11608.jpg', 'cat.11634.jpg', 'cat.11661.jpg',
       'cat.11675.jpg', 'cat.11683.jpg', 'cat.11724.jpg', 'cat.11726.jpg',
       'cat.11777.jpg', 'cat.11823.jpg', 'cat.11870.jpg', 'cat.11879.jpg',
       'cat.11923.jpg', 'cat.11941.jpg', 'cat.11968.jpg', 'cat.11980.jpg',
       'cat.12126.jpg', 'cat.12182.jpg', 'cat.12189.jpg', 'cat.12219.jpg',
       'cat.12222.jpg', 'cat.12227.jpg', 'cat.12239.jpg', 'cat.12252.jpg',
       'cat.12272.jpg', 'cat.12326.jpg', 'cat.12359.jpg', 'cat.12378.jpg',
       'cat.12380.jpg', 'cat.12392.jpg', 'cat.124.jpg', 'cat.12424.jpg',
       'cat.12431.jpg', 'cat.12476.jpg', 'cat.12493.jpg', 'cat.12494.jpg',
       'cat.1277.jpg', 'cat.1361.jpg', 'cat.1380.jpg', 'cat.1423.jpg',
       'cat.1485.jpg', 'cat.1507.jpg', 'cat.1619.jpg', 'cat.1631.jpg',
       'cat.169.jpg', 'cat.1741.jpg', 'cat.1747.jpg', 'cat.1906.jpg',
       'cat.1928.jpg', 'cat.2047.jpg', 'cat.2150.jpg', 'cat.2267.jpg',
       'cat.2279.jpg', 'cat.2337.jpg', 'cat.2355.jpg', 'cat.2384.jpg',
       'cat.2429.jpg', 'cat.2433.jpg', 'cat.245.jpg', 'cat.2456.jpg',
       'cat.2457.jpg', 'cat.2470.jpg', 'cat.2494.jpg', 'cat.2509.jpg',
       'cat.252.jpg', 'cat.2520.jpg', 'cat.2598.jpg', 'cat.2621.jpg',
       'cat.2663.jpg', 'cat.2715.jpg', 'cat.2735.jpg', 'cat.2737.jpg',
       'cat.2748.jpg', 'cat.2817.jpg', 'cat.2835.jpg', 'cat.2845.jpg',
       'cat.2893.jpg', 'cat.2924.jpg', 'cat.2939.jpg', 'cat.2951.jpg',
       'cat.297.jpg', 'cat.3004.jpg', 'cat.3046.jpg', 'cat.3105.jpg',
       'cat.3123.jpg', 'cat.3176.jpg', 'cat.3181.jpg', 'cat.3216.jpg',
       'cat.3219.jpg', 'cat.3250.jpg', 'cat.335.jpg', 'cat.3369.jpg',
       'cat.3399.jpg', 'cat.3410.jpg', 'cat.343.jpg', 'cat.3472.jpg',
       'cat.354.jpg', 'cat.3543.jpg', 'cat.3566.jpg', 'cat.3637.jpg',
       'cat.3658.jpg', 'cat.3672.jpg', 'cat.3699.jpg', 'cat.3713.jpg',
       'cat.372.jpg', 'cat.3731.jpg', 'cat.3738.jpg', 'cat.3766.jpg',
       'cat.3822.jpg', 'cat.3845.jpg', 'cat.3859.jpg', 'cat.3868.jpg',
       'cat.388.jpg', 'cat.3892.jpg', 'cat.4076.jpg', 'cat.4100.jpg',
       'cat.4126.jpg', 'cat.4190.jpg', 'cat.4199.jpg', 'cat.4272.jpg',
       'cat.4308.jpg', 'cat.4338.jpg', 'cat.4360.jpg', 'cat.44.jpg',
       'cat.4432.jpg', 'cat.45.jpg', 'cat.4503.jpg', 'cat.4554.jpg',
       'cat.4575.jpg', 'cat.4688.jpg', 'cat.4786.jpg', 'cat.4807.jpg',
       'cat.4833.jpg', 'cat.4842.jpg', 'cat.4852.jpg', 'cat.4874.jpg',
       'cat.4965.jpg', 'cat.4976.jpg', 'cat.4985.jpg', 'cat.4986.jpg',
       'cat.499.jpg', 'cat.4994.jpg', 'cat.5.jpg', 'cat.503.jpg',
       'cat.5071.jpg', 'cat.5123.jpg', 'cat.5168.jpg', 'cat.5173.jpg',
       'cat.5176.jpg', 'cat.5184.jpg', 'cat.5241.jpg', 'cat.5324.jpg',
       'cat.5347.jpg', 'cat.5351.jpg', 'cat.5355.jpg', 'cat.539.jpg',
       'cat.5418.jpg', 'cat.5435.jpg', 'cat.5446.jpg', 'cat.5465.jpg',
       'cat.5472.jpg', 'cat.5502.jpg', 'cat.5527.jpg', 'cat.5534.jpg',
       'cat.5583.jpg', 'cat.5595.jpg', 'cat.5609.jpg', 'cat.5623.jpg',
       'cat.5712.jpg', 'cat.5733.jpg', 'cat.5773.jpg', 'cat.5795.jpg',
       'cat.5804.jpg', 'cat.5816.jpg', 'cat.5818.jpg', 'cat.5820.jpg',
       'cat.5843.jpg', 'cat.587.jpg', 'cat.5954.jpg', 'cat.5974.jpg',
       'cat.5981.jpg', 'cat.599.jpg', 'cat.6.jpg', 'cat.6017.jpg',
       'cat.6085.jpg', 'cat.6086.jpg', 'cat.6235.jpg', 'cat.6262.jpg',
       'cat.6272.jpg', 'cat.6307.jpg', 'cat.6345.jpg', 'cat.6348.jpg',
       'cat.6402.jpg', 'cat.6429.jpg', 'cat.6442.jpg', 'cat.6451.jpg',
       'cat.6526.jpg', 'cat.6530.jpg', 'cat.6543.jpg', 'cat.6568.jpg',
       'cat.6590.jpg', 'cat.6655.jpg', 'cat.6696.jpg', 'cat.6699.jpg',
       'cat.6703.jpg', 'cat.6734.jpg', 'cat.674.jpg', 'cat.6781.jpg',
       'cat.6868.jpg', 'cat.6900.jpg', 'cat.6906.jpg', 'cat.6915.jpg',
       'cat.6965.jpg', 'cat.7001.jpg', 'cat.7009.jpg', 'cat.704.jpg',
       'cat.7047.jpg', 'cat.712.jpg', 'cat.7194.jpg', 'cat.7206.jpg',
       'cat.724.jpg', 'cat.7263.jpg', 'cat.7281.jpg', 'cat.7291.jpg',
       'cat.7296.jpg', 'cat.7300.jpg', 'cat.7312.jpg', 'cat.7354.jpg',
       'cat.7372.jpg', 'cat.7377.jpg', 'cat.738.jpg', 'cat.7411.jpg',
       'cat.7416.jpg', 'cat.7429.jpg', 'cat.7432.jpg', 'cat.7464.jpg',
       'cat.7487.jpg', 'cat.7493.jpg', 'cat.7526.jpg', 'cat.7545.jpg',
       'cat.7550.jpg', 'cat.7564.jpg', 'cat.7599.jpg', 'cat.7604.jpg',
       'cat.7661.jpg', 'cat.767.jpg', 'cat.7671.jpg', 'cat.7682.jpg',
       'cat.7703.jpg', 'cat.7707.jpg', 'cat.7728.jpg', 'cat.7738.jpg',
       'cat.7758.jpg', 'cat.7897.jpg', 'cat.7899.jpg', 'cat.7919.jpg',
       'cat.7920.jpg', 'cat.796.jpg', 'cat.7964.jpg', 'cat.7968.jpg',
       'cat.8033.jpg', 'cat.8044.jpg', 'cat.8056.jpg', 'cat.8068.jpg',
       'cat.8087.jpg', 'cat.8092.jpg', 'cat.8118.jpg', 'cat.8122.jpg',
       'cat.8138.jpg', 'cat.8200.jpg', 'cat.8289.jpg', 'cat.8369.jpg',
       'cat.8383.jpg', 'cat.8431.jpg', 'cat.8448.jpg', 'cat.8456.jpg',
       'cat.8470.jpg', 'cat.8487.jpg', 'cat.8504.jpg', 'cat.8542.jpg',
       'cat.8576.jpg', 'cat.8647.jpg', 'cat.8657.jpg', 'cat.8675.jpg',
       'cat.8744.jpg', 'cat.8755.jpg', 'cat.8765.jpg', 'cat.8828.jpg',
       'cat.883.jpg', 'cat.8854.jpg', 'cat.8914.jpg', 'cat.8921.jpg',
       'cat.9006.jpg', 'cat.9066.jpg', 'cat.908.jpg', 'cat.9090.jpg',
       'cat.9110.jpg', 'cat.9144.jpg', 'cat.9171.jpg', 'cat.9250.jpg',
       'cat.9277.jpg', 'cat.9288.jpg', 'cat.9290.jpg', 'cat.933.jpg',
       'cat.934.jpg', 'cat.9360.jpg', 'cat.9444.jpg', 'cat.9445.jpg',
       'cat.9456.jpg', 'cat.9458.jpg', 'cat.9494.jpg', 'cat.9499.jpg',
       'cat.9513.jpg', 'cat.9520.jpg', 'cat.9552.jpg', 'cat.9589.jpg',
       'cat.9596.jpg', 'cat.9622.jpg', 'cat.9642.jpg', 'cat.9695.jpg',
       'cat.9836.jpg', 'cat.9882.jpg', 'cat.9925.jpg', 'cat.9947.jpg',
       'cat.9983.jpg', 'cat.9986.jpg', 'cat.9987.jpg', 'dog.10161.jpg',
       'dog.10190.jpg', 'dog.10225.jpg', 'dog.10237.jpg', 'dog.10747.jpg',
       'dog.10801.jpg', 'dog.11266.jpg', 'dog.11299.jpg', 'dog.11437.jpg',
       'dog.1194.jpg', 'dog.12148.jpg', 'dog.12155.jpg', 'dog.12376.jpg',
       'dog.1259.jpg', 'dog.1308.jpg', 'dog.1625.jpg', 'dog.1773.jpg',
       'dog.1895.jpg', 'dog.2339.jpg', 'dog.2422.jpg', 'dog.2614.jpg',
       'dog.3035.jpg', 'dog.3497.jpg', 'dog.3889.jpg', 'dog.4218.jpg',
       'dog.4367.jpg', 'dog.4507.jpg', 'dog.4595.jpg', 'dog.5336.jpg',
       'dog.5604.jpg', 'dog.6028.jpg', 'dog.6413.jpg', 'dog.6475.jpg',
       'dog.6725.jpg', 'dog.7076.jpg', 'dog.729.jpg', 'dog.806.jpg',
       'dog.8736.jpg', 'dog.8898.jpg', 'dog.9188.jpg', 'dog.9517.jpg'],
      dtype='<U13')
In [11]:
# Record the identification by all models
np.savez(CF_FOLDER / 'models_all_top30.npz', all=cheat_files_by_models)
In [90]:
cheat_files_by_models = np.load(CF_FOLDER / 'models_all_top30.npz')['all']
In [91]:
# Abnormal images listed on kaggle discussion forum
import pandas as pd

from pprint import pprint

# The following files listed on discussion forum for reference
df_cheat = pd.read_csv('cheat.csv')
arr_cheat = df_cheat['filename'].values
arr_cheat = [name + '.jpg' for name in arr_cheat if not name.endswith('.jpg')]
print("furthermore, there are {} cheat files collected by discussion forum:".format(len(arr_cheat)))
pprint(arr_cheat)
furthermore, there are 49 cheat files collected by discussion forum:
['dog.11731.jpg',
 'dog.4334.jpg',
 'cat.4688.jpg',
 'cat.11222.jpg',
 'cat.1450.jpg',
 'cat.2159.jpg',
 'cat.3822.jpg',
 'cat.4104.jpg',
 'cat.5355.jpg',
 'cat.7194.jpg',
 'cat.7920.jpg',
 'cat.9250.jpg',
 'cat.9444.jpg',
 'cat.9882.jpg',
 'dog.11538.jpg',
 'dog.11724.jpg',
 'dog.8507.jpg',
 'cat.2939.jpg',
 'cat.3216.jpg',
 'cat.4833.jpg',
 'cat.7968.jpg',
 'cat.8470.jpg',
 'dog.10161.jpg',
 'dog.10190.jpg',
 'dog.11186.jpg',
 'dog.1308.jpg',
 'dog.1895.jpg',
 'dog.9188.jpg',
 'cat.5418.jpg',
 'dog.10747.jpg',
 'dog.2614.jpg',
 'dog.4367.jpg',
 'dog.8736.jpg',
 'cat.7377.jpg',
 'dog.12376.jpg',
 'dog.1773.jpg',
 'cat.10712.jpg',
 'cat.11184.jpg',
 'cat.7564.jpg',
 'cat.8456.jpg',
 'dog.10237.jpg',
 'dog.1043.jpg',
 'dog.1194.jpg',
 'dog.5604.jpg',
 'dog.9517.jpg',
 'cat.11565.jpg',
 'dog.10797.jpg',
 'dog.2877.jpg',
 'dog.8898.jpg']
In [98]:
# Cheat files aggregation
# cheatfiles_all = np.union1d(cheat_files_by_models, arr_cheat)
cheatfiles_all = reduce(np.union1d, (vary, cheat_files_by_models, arr_cheat))
print("From the above, there are {} cheat files in all, as below:".format(len(cheatfiles_all)))
pprint(cheatfiles_all)
From the above, there are 476 cheat files in all, as below:
array(['cat.10029.jpg', 'cat.10037.jpg', 'cat.10107.jpg', 'cat.10119.jpg',
       'cat.10121.jpg', 'cat.10192.jpg', 'cat.10209.jpg', 'cat.10220.jpg',
       'cat.10266.jpg', 'cat.10270.jpg', 'cat.10365.jpg', 'cat.10425.jpg',
       'cat.10471.jpg', 'cat.10521.jpg', 'cat.10532.jpg', 'cat.10536.jpg',
       'cat.10539.jpg', 'cat.10579.jpg', 'cat.10609.jpg', 'cat.10634.jpg',
       'cat.10636.jpg', 'cat.10700.jpg', 'cat.10712.jpg', 'cat.10743.jpg',
       'cat.10807.jpg', 'cat.10863.jpg', 'cat.10864.jpg', 'cat.10893.jpg',
       'cat.10912.jpg', 'cat.10932.jpg', 'cat.10946.jpg', 'cat.10988.jpg',
       'cat.11018.jpg', 'cat.11039.jpg', 'cat.11060.jpg', 'cat.11062.jpg',
       'cat.11141.jpg', 'cat.11149.jpg', 'cat.11168.jpg', 'cat.11184.jpg',
       'cat.11222.jpg', 'cat.11231.jpg', 'cat.11255.jpg', 'cat.11256.jpg',
       'cat.11281.jpg', 'cat.11297.jpg', 'cat.11349.jpg', 'cat.1139.jpg',
       'cat.11399.jpg', 'cat.114.jpg', 'cat.11432.jpg', 'cat.11512.jpg',
       'cat.11544.jpg', 'cat.11562.jpg', 'cat.11565.jpg', 'cat.11607.jpg',
       'cat.11608.jpg', 'cat.11634.jpg', 'cat.11661.jpg', 'cat.11675.jpg',
       'cat.11683.jpg', 'cat.11724.jpg', 'cat.11726.jpg', 'cat.11777.jpg',
       'cat.11823.jpg', 'cat.11870.jpg', 'cat.11879.jpg', 'cat.11923.jpg',
       'cat.11941.jpg', 'cat.11968.jpg', 'cat.11980.jpg', 'cat.12126.jpg',
       'cat.12182.jpg', 'cat.12189.jpg', 'cat.12219.jpg', 'cat.12222.jpg',
       'cat.12227.jpg', 'cat.12239.jpg', 'cat.12243.jpg', 'cat.12252.jpg',
       'cat.12272.jpg', 'cat.12326.jpg', 'cat.12359.jpg', 'cat.12378.jpg',
       'cat.12380.jpg', 'cat.12392.jpg', 'cat.124.jpg', 'cat.12420.jpg',
       'cat.12424.jpg', 'cat.12431.jpg', 'cat.12476.jpg', 'cat.12493.jpg',
       'cat.12494.jpg', 'cat.1277.jpg', 'cat.1361.jpg', 'cat.1380.jpg',
       'cat.1423.jpg', 'cat.1450.jpg', 'cat.1485.jpg', 'cat.1507.jpg',
       'cat.1619.jpg', 'cat.1631.jpg', 'cat.169.jpg', 'cat.1723.jpg',
       'cat.1741.jpg', 'cat.1747.jpg', 'cat.188.jpg', 'cat.1906.jpg',
       'cat.1928.jpg', 'cat.2047.jpg', 'cat.2150.jpg', 'cat.2159.jpg',
       'cat.2267.jpg', 'cat.2279.jpg', 'cat.2337.jpg', 'cat.2355.jpg',
       'cat.2384.jpg', 'cat.2429.jpg', 'cat.2433.jpg', 'cat.245.jpg',
       'cat.2456.jpg', 'cat.2457.jpg', 'cat.2470.jpg', 'cat.2494.jpg',
       'cat.2509.jpg', 'cat.252.jpg', 'cat.2520.jpg', 'cat.2598.jpg',
       'cat.2621.jpg', 'cat.2663.jpg', 'cat.2715.jpg', 'cat.2735.jpg',
       'cat.2737.jpg', 'cat.2748.jpg', 'cat.2817.jpg', 'cat.2835.jpg',
       'cat.2845.jpg', 'cat.2893.jpg', 'cat.2924.jpg', 'cat.2939.jpg',
       'cat.2951.jpg', 'cat.297.jpg', 'cat.3004.jpg', 'cat.3046.jpg',
       'cat.3105.jpg', 'cat.3123.jpg', 'cat.3176.jpg', 'cat.3181.jpg',
       'cat.3216.jpg', 'cat.3219.jpg', 'cat.3250.jpg', 'cat.3324.jpg',
       'cat.335.jpg', 'cat.3369.jpg', 'cat.3370.jpg', 'cat.3399.jpg',
       'cat.3410.jpg', 'cat.343.jpg', 'cat.3472.jpg', 'cat.354.jpg',
       'cat.3543.jpg', 'cat.3566.jpg', 'cat.3567.jpg', 'cat.3637.jpg',
       'cat.3658.jpg', 'cat.3672.jpg', 'cat.3699.jpg', 'cat.3713.jpg',
       'cat.372.jpg', 'cat.3731.jpg', 'cat.3738.jpg', 'cat.3766.jpg',
       'cat.3822.jpg', 'cat.3845.jpg', 'cat.3859.jpg', 'cat.3868.jpg',
       'cat.388.jpg', 'cat.3892.jpg', 'cat.4076.jpg', 'cat.4100.jpg',
       'cat.4104.jpg', 'cat.4126.jpg', 'cat.4190.jpg', 'cat.4199.jpg',
       'cat.4272.jpg', 'cat.4308.jpg', 'cat.4338.jpg', 'cat.4360.jpg',
       'cat.44.jpg', 'cat.4432.jpg', 'cat.45.jpg', 'cat.4503.jpg',
       'cat.4554.jpg', 'cat.4575.jpg', 'cat.4688.jpg', 'cat.4786.jpg',
       'cat.4807.jpg', 'cat.4833.jpg', 'cat.4842.jpg', 'cat.4852.jpg',
       'cat.4874.jpg', 'cat.4965.jpg', 'cat.4976.jpg', 'cat.4985.jpg',
       'cat.4986.jpg', 'cat.499.jpg', 'cat.4994.jpg', 'cat.5.jpg',
       'cat.503.jpg', 'cat.5071.jpg', 'cat.5123.jpg', 'cat.5168.jpg',
       'cat.5173.jpg', 'cat.5176.jpg', 'cat.5184.jpg', 'cat.5241.jpg',
       'cat.5324.jpg', 'cat.5347.jpg', 'cat.5351.jpg', 'cat.5355.jpg',
       'cat.539.jpg', 'cat.5418.jpg', 'cat.5435.jpg', 'cat.5446.jpg',
       'cat.5465.jpg', 'cat.5472.jpg', 'cat.5502.jpg', 'cat.5527.jpg',
       'cat.5534.jpg', 'cat.5583.jpg', 'cat.5595.jpg', 'cat.5609.jpg',
       'cat.5623.jpg', 'cat.5712.jpg', 'cat.5733.jpg', 'cat.5773.jpg',
       'cat.5795.jpg', 'cat.5804.jpg', 'cat.5816.jpg', 'cat.5818.jpg',
       'cat.5820.jpg', 'cat.5843.jpg', 'cat.587.jpg', 'cat.5929.jpg',
       'cat.5954.jpg', 'cat.5964.jpg', 'cat.5974.jpg', 'cat.5981.jpg',
       'cat.599.jpg', 'cat.6.jpg', 'cat.6017.jpg', 'cat.6085.jpg',
       'cat.6086.jpg', 'cat.6232.jpg', 'cat.6235.jpg', 'cat.6262.jpg',
       'cat.6272.jpg', 'cat.6307.jpg', 'cat.6345.jpg', 'cat.6348.jpg',
       'cat.6402.jpg', 'cat.6429.jpg', 'cat.6442.jpg', 'cat.6451.jpg',
       'cat.6526.jpg', 'cat.6530.jpg', 'cat.6543.jpg', 'cat.6568.jpg',
       'cat.6590.jpg', 'cat.664.jpg', 'cat.6655.jpg', 'cat.6696.jpg',
       'cat.6699.jpg', 'cat.6703.jpg', 'cat.6734.jpg', 'cat.674.jpg',
       'cat.6781.jpg', 'cat.6868.jpg', 'cat.6900.jpg', 'cat.6906.jpg',
       'cat.6915.jpg', 'cat.6965.jpg', 'cat.7001.jpg', 'cat.7009.jpg',
       'cat.704.jpg', 'cat.7047.jpg', 'cat.712.jpg', 'cat.7194.jpg',
       'cat.7206.jpg', 'cat.724.jpg', 'cat.7263.jpg', 'cat.728.jpg',
       'cat.7281.jpg', 'cat.7291.jpg', 'cat.7296.jpg', 'cat.7300.jpg',
       'cat.7312.jpg', 'cat.7354.jpg', 'cat.7372.jpg', 'cat.7377.jpg',
       'cat.738.jpg', 'cat.7411.jpg', 'cat.7416.jpg', 'cat.7429.jpg',
       'cat.7432.jpg', 'cat.7464.jpg', 'cat.7487.jpg', 'cat.7493.jpg',
       'cat.7526.jpg', 'cat.7545.jpg', 'cat.7550.jpg', 'cat.7564.jpg',
       'cat.7599.jpg', 'cat.7604.jpg', 'cat.7622.jpg', 'cat.7661.jpg',
       'cat.767.jpg', 'cat.7671.jpg', 'cat.7682.jpg', 'cat.7703.jpg',
       'cat.7707.jpg', 'cat.7728.jpg', 'cat.7738.jpg', 'cat.7758.jpg',
       'cat.7897.jpg', 'cat.7899.jpg', 'cat.7919.jpg', 'cat.7920.jpg',
       'cat.796.jpg', 'cat.7964.jpg', 'cat.7968.jpg', 'cat.8033.jpg',
       'cat.8044.jpg', 'cat.8056.jpg', 'cat.8068.jpg', 'cat.8087.jpg',
       'cat.8092.jpg', 'cat.8118.jpg', 'cat.8122.jpg', 'cat.8138.jpg',
       'cat.8200.jpg', 'cat.8289.jpg', 'cat.8369.jpg', 'cat.8383.jpg',
       'cat.8431.jpg', 'cat.8448.jpg', 'cat.8456.jpg', 'cat.8470.jpg',
       'cat.8487.jpg', 'cat.8504.jpg', 'cat.8542.jpg', 'cat.8576.jpg',
       'cat.8647.jpg', 'cat.8657.jpg', 'cat.8675.jpg', 'cat.8744.jpg',
       'cat.8755.jpg', 'cat.8765.jpg', 'cat.8828.jpg', 'cat.883.jpg',
       'cat.8854.jpg', 'cat.8902.jpg', 'cat.8914.jpg', 'cat.8921.jpg',
       'cat.9006.jpg', 'cat.9066.jpg', 'cat.908.jpg', 'cat.9090.jpg',
       'cat.9110.jpg', 'cat.9144.jpg', 'cat.9171.jpg', 'cat.9250.jpg',
       'cat.9277.jpg', 'cat.9288.jpg', 'cat.9290.jpg', 'cat.933.jpg',
       'cat.934.jpg', 'cat.9360.jpg', 'cat.9444.jpg', 'cat.9445.jpg',
       'cat.9456.jpg', 'cat.9458.jpg', 'cat.9494.jpg', 'cat.9499.jpg',
       'cat.9513.jpg', 'cat.9520.jpg', 'cat.9552.jpg', 'cat.9589.jpg',
       'cat.9596.jpg', 'cat.9622.jpg', 'cat.9642.jpg', 'cat.9695.jpg',
       'cat.9819.jpg', 'cat.9836.jpg', 'cat.9882.jpg', 'cat.9925.jpg',
       'cat.9947.jpg', 'cat.9954.jpg', 'cat.9983.jpg', 'cat.9986.jpg',
       'cat.9987.jpg', 'dog.10161.jpg', 'dog.10190.jpg', 'dog.10199.jpg',
       'dog.10225.jpg', 'dog.10237.jpg', 'dog.1043.jpg', 'dog.10747.jpg',
       'dog.10749.jpg', 'dog.10797.jpg', 'dog.10801.jpg', 'dog.11186.jpg',
       'dog.11248.jpg', 'dog.11266.jpg', 'dog.11299.jpg', 'dog.11437.jpg',
       'dog.11526.jpg', 'dog.11538.jpg', 'dog.11724.jpg', 'dog.11731.jpg',
       'dog.1194.jpg', 'dog.12148.jpg', 'dog.12155.jpg', 'dog.12331.jpg',
       'dog.12376.jpg', 'dog.1259.jpg', 'dog.1308.jpg', 'dog.1483.jpg',
       'dog.1625.jpg', 'dog.1741.jpg', 'dog.1773.jpg', 'dog.1895.jpg',
       'dog.1985.jpg', 'dog.2339.jpg', 'dog.2422.jpg', 'dog.2478.jpg',
       'dog.2503.jpg', 'dog.2614.jpg', 'dog.2874.jpg', 'dog.2877.jpg',
       'dog.3035.jpg', 'dog.3135.jpg', 'dog.3139.jpg', 'dog.3330.jpg',
       'dog.3380.jpg', 'dog.3497.jpg', 'dog.3863.jpg', 'dog.3889.jpg',
       'dog.4113.jpg', 'dog.4218.jpg', 'dog.4334.jpg', 'dog.4367.jpg',
       'dog.4507.jpg', 'dog.4595.jpg', 'dog.4712.jpg', 'dog.5336.jpg',
       'dog.5604.jpg', 'dog.5880.jpg', 'dog.6028.jpg', 'dog.6413.jpg',
       'dog.6475.jpg', 'dog.6725.jpg', 'dog.7019.jpg', 'dog.7076.jpg',
       'dog.729.jpg', 'dog.7294.jpg', 'dog.7857.jpg', 'dog.806.jpg',
       'dog.8142.jpg', 'dog.8507.jpg', 'dog.8736.jpg', 'dog.8739.jpg',
       'dog.8898.jpg', 'dog.9188.jpg', 'dog.9517.jpg', 'dog.9632.jpg'],
      dtype='<U13')

-----------------------

Dirty Data Visualization

-----------------------

In [13]:
# Function that let us see some abnormal images
def view_of_cheat_files(nrows, ncols, start, end):
    if abs(end - start) > nrows * ncols:
        raise IndexError("index out of range!")
    
    fig, ax = plt.subplots(nrows, ncols, figsize=(25, 25))
    plt.subplots_adjust(hspace=.9)

    for num in range(start, end):
        file_name = cheatfiles_all[num]
        file_path = FOLDER_TRN / cheatfiles_all[num]
        label = file_name.split('.')[0]
        img = plt.imread(file_path)
    
        ax[num // ncols, num % ncols].imshow(img)
        ax[num // ncols, num % ncols].set_title("file: {}\n label: {}".format(file_name, label), size=12)
        ax[num // ncols, num % ncols].get_xaxis().set_visible(False)
        ax[num // ncols, num % ncols].get_yaxis().set_visible(False)
In [23]:
# view of the top 64 abnormal images
view_of_cheat_files(8, 8, 0, 64)
In [24]:
# view of the last 64 abnormal images
view_of_cheat_files(8, 8, -64, 0)

-----------------------

Dirty Data Removing

-----------------------

In [99]:
# Remove above cheat files & bulid a dataframe
for file in cheatfiles_all:
    fnames.remove(file)

df = pd.DataFrame({
    'file': fnames,
    'label': [name.split('.')[0] for name in fnames]
})
df.head()
Out[99]:
file label
0 cat.2960.jpg cat
1 dog.11107.jpg dog
2 cat.3056.jpg cat
3 cat.12279.jpg cat
4 dog.7786.jpg dog
In [100]:
# Save as csv file
df.to_csv('data_clean.csv', index=False)

Training Set & Validation Set

In [101]:
df = pd.read_csv('data_clean.csv')
df.head()
Out[101]:
file label
0 cat.2960.jpg cat
1 dog.11107.jpg dog
2 cat.3056.jpg cat
3 cat.12279.jpg cat
4 dog.7786.jpg dog
In [10]:
from sklearn.model_selection import train_test_split


# Dataframe of training set & validation set
df_trn, df_val = train_test_split(df, stratify=df['label'], random_state=0)
df_trn.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
In [11]:
def show_images_from_df(data_df, nrows, ncols, directory=FOLDER_TRN, is_train=True, pred_label=None):
    """
        showing images from dataframe.
    """
    
    fig, ax = plt.subplots(nrows, ncols, figsize=(15, 15))
    plt.subplots_adjust(wspace=.4, hspace=.5)
    
    for idx, row in data_df[:nrows*ncols].iterrows():
        fname = row['file']
        prob = pred_label[idx] if not is_train else 1.000
        label = row['label'] if is_train  else ('dog' if prob > 0.500 else 'cat')
        img = plt.imread(os.path.join(directory, fname))
        
        ax[idx // ncols, idx % ncols].imshow(img)
        if is_train:
            ax[idx // ncols, idx % ncols].set_title("file: {}\n label: {}".format(fname, label), size=12)
        else:
            ax[idx // ncols, idx % ncols].set_title("file: {}\n prediction: {}\nprobability:{:.3f}".format(fname, label, prob), size=12)

        ax[idx // ncols, idx % ncols].get_xaxis().set_visible(False)
        ax[idx // ncols, idx % ncols].get_yaxis().set_visible(False)
In [30]:
# Show images of training data
show_images_from_df(df_trn, 5, 5)
In [31]:
# Show images of validation data
show_images_from_df(df_val, 5, 5)

Data Augmentation

In [12]:
from keras.preprocessing.image import ImageDataGenerator


# Batch size
BATCH_SIZE_331 = 16
BATCH_SIZE_299 = 32

# Image size
SIZE_299 = (299, 299)
SIZE_331 = (331, 331)

def generator_flow(
    datagen, df, target_size, batch_size,
    directory=FOLDER_TRN, x_col='file', y_col='label',
    mode='binary', shuffle=True
):
    gen = datagen.flow_from_dataframe(
        df, 
        directory=directory, 
        x_col=x_col,
        y_col=y_col, 
        target_size=target_size, 
        class_mode=mode,
        batch_size=batch_size,
        shuffle=shuffle
    )
    return gen

datagen_trn = ImageDataGenerator(
    rotation_range=25,
    
    width_shift_range=.1,
    height_shift_range=.1,
    zoom_range=.2,
    shear_range=.1,
    horizontal_flip=True
)
datagen_val = ImageDataGenerator()
In [13]:
# Data generator of training data & validation data with different size
generator_trn_331 = generator_flow(datagen_trn, df_trn, SIZE_331, BATCH_SIZE_331)
generator_val_331 = generator_flow(datagen_val, df_val, SIZE_331, BATCH_SIZE_331)
generator_trn_299 = generator_flow(datagen_trn, df_trn, SIZE_299, BATCH_SIZE_299)
generator_val_299 = generator_flow(datagen_val, df_val, SIZE_299, BATCH_SIZE_299)
Found 18429 images belonging to 2 classes.
Found 6143 images belonging to 2 classes.
Found 18429 images belonging to 2 classes.
Found 6143 images belonging to 2 classes.
In [83]:
# Class indices
print(generator_trn_331.class_indices)
print(generator_trn_299.class_indices)
print(generator_val_331.class_indices)
print(generator_val_299.class_indices)
{'cat': 0, 'dog': 1}
{'cat': 0, 'dog': 1}
{'cat': 0, 'dog': 1}
{'cat': 0, 'dog': 1}
In [14]:
# define a function for sorting the testing images in their directory by number in image name
import re


def key_func(entry):
    """
        sort files in their directory by number of its name.
    """
    return int(re.search(r'\d+', entry).group())
In [15]:
# Sort testing data
FOLDER_TEST = Path('test/')

fnames_tst = os.listdir(FOLDER_TEST)
fnames_tst.sort(key=key_func)
fnames_tst[:10]
Out[15]:
['1.jpg',
 '2.jpg',
 '3.jpg',
 '4.jpg',
 '5.jpg',
 '6.jpg',
 '7.jpg',
 '8.jpg',
 '9.jpg',
 '10.jpg']
In [16]:
# Generator for testing data, set 'shuffle'=False
df_tst = pd.DataFrame({
    'file': fnames_tst
})
datagen_tst = ImageDataGenerator()
generator_tst_331 = generator_flow(datagen_tst, df_tst, (331, 331), 16, directory=FOLDER_TEST, y_col=None, mode=None, shuffle=False)
generator_tst_299 = generator_flow(datagen_tst, df_tst, SIZE_299, BATCH_SIZE_299, directory=FOLDER_TEST, y_col=None, mode=None, shuffle=False)
Found 12500 images.
Found 12500 images.

***

Transfer Learning

Pretrained Xception, InceptionV3, NASNetLarge & InceptionResNetV2

Model Building

***

In [17]:
from keras.models import Model
from keras.layers import Input, Lambda


SHAPE_299 = (299, 299, 3)
SHAPE_331 = (331, 331, 3)


def build_pretrained_model(base_model, input_shape, preprocess_func, weight_scheme='imagenet', trainable=False):
    """
        pretrained model, remove top level, preprocess inputs, for extrating features.
    """
    model_input = Input(shape=input_shape) 
    tensor = Lambda(preprocess_func)(model_input)
    pretrained_model = base_model(input_tensor=tensor, weights=weight_scheme, include_top=False, pooling='avg')
    model = Model(model_input, pretrained_model.output)
    model.trainable = trainable
    
    return model
In [18]:
# Build pretrained Xception & InceptionV3 model with input shape (299, 299, 3) 
model_xception = build_pretrained_model(Xception, SHAPE_299, xception.preprocess_input)
model_inceptionv3 = build_pretrained_model(InceptionV3, SHAPE_299, inception_v3.preprocess_input)

# Build pretrained NASNetLarge & InceptionResNetV2 model with input shape (331, 331, 3) 
model_nasnet = build_pretrained_model(NASNetLarge, SHAPE_331, nasnet.preprocess_input)
model_inception_resnet = build_pretrained_model(InceptionResNetV2, SHAPE_331, inception_resnet_v2.preprocess_input)
WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
In [19]:
from keras.layers import Concatenate, Dense, Dropout, Flatten, BatchNormalization, Activation


def build_model(pretrained_models, input_shape):
    """
        build model with features extracted by pretained models, for predicting testing data.
    """
    model_input = Input(shape=input_shape)
    extracted_features = [pm(model_input) for pm in pretrained_models]

    x = Concatenate()(extracted_features)
    x = Dropout(rate=.5, name='dropout')(x)
    
    if x.shape[-1] == 4096:
        x = Dense(1024, name='dense1')(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(rate=.5, name='drop1')(x)
        x = Dense(16, name='dense2')(x)
        x = Activation('relu')(x)
        x = Dropout(rate=.3, name='drop2')(x)

    else:
        x = Dense(128, name='dense1')(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
        x = Dropout(rate=.3, name='drop1')(x)
    
    prob = Dense(1, activation='sigmoid', name='prob')(x)
    model = Model(inputs=model_input, outputs=prob)
    
    return model
In [20]:
# Build model with input shape (299, 299, 3)
model_299 = build_model([model_xception, model_inceptionv3], SHAPE_299)
# Build model with input shape (331, 331, 3)
model_331 = build_model([model_nasnet, model_inception_resnet], SHAPE_331)
WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
In [80]:
# Summary of Model with input shape (299, 299, 3)
model_299.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_10 (InputLayer)           (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
model_8 (Model)                 (None, 2048)         20861480    input_10[0][0]                   
__________________________________________________________________________________________________
model_9 (Model)                 (None, 2048)         21802784    input_10[0][0]                   
__________________________________________________________________________________________________
concatenate_10 (Concatenate)    (None, 4096)         0           model_8[1][0]                    
                                                                 model_9[1][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 4096)         0           concatenate_10[0][0]             
__________________________________________________________________________________________________
dense1 (Dense)                  (None, 1024)         4195328     dropout[0][0]                    
__________________________________________________________________________________________________
batch_normalization_298 (BatchN (None, 1024)         4096        dense1[0][0]                     
__________________________________________________________________________________________________
activation_287 (Activation)     (None, 1024)         0           batch_normalization_298[0][0]    
__________________________________________________________________________________________________
drop1 (Dropout)                 (None, 1024)         0           activation_287[0][0]             
__________________________________________________________________________________________________
dense2 (Dense)                  (None, 16)           16400       drop1[0][0]                      
__________________________________________________________________________________________________
activation_288 (Activation)     (None, 16)           0           dense2[0][0]                     
__________________________________________________________________________________________________
drop2 (Dropout)                 (None, 16)           0           activation_288[0][0]             
__________________________________________________________________________________________________
prob (Dense)                    (None, 1)            17          drop2[0][0]                      
==================================================================================================
Total params: 46,880,105
Trainable params: 4,213,793
Non-trainable params: 42,666,312
__________________________________________________________________________________________________
In [68]:
# Summary of Model with input shape (224, 224, 3)
model_331.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_8 (InputLayer)            (None, 331, 331, 3)  0                                            
__________________________________________________________________________________________________
model_1 (Model)                 (None, 4032)         84916818    input_8[0][0]                    
__________________________________________________________________________________________________
model_2 (Model)                 (None, 1536)         54336736    input_8[0][0]                    
__________________________________________________________________________________________________
concatenate_12 (Concatenate)    (None, 5568)         0           model_1[4][0]                    
                                                                 model_2[4][0]                    
__________________________________________________________________________________________________
drop_0 (Dropout)                (None, 5568)         0           concatenate_12[0][0]             
__________________________________________________________________________________________________
dense_0 (Dense)                 (None, 128)          712832      drop_0[0][0]                     
__________________________________________________________________________________________________
batch_normalization_409 (BatchN (None, 128)          512         dense_0[0][0]                    
__________________________________________________________________________________________________
activation_927 (Activation)     (None, 128)          0           batch_normalization_409[0][0]    
__________________________________________________________________________________________________
drop_1 (Dropout)                (None, 128)          0           activation_927[0][0]             
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1)            129         drop_1[0][0]                     
==================================================================================================
Total params: 139,967,027
Trainable params: 713,217
Non-trainable params: 139,253,810
__________________________________________________________________________________________________

Model Visualization

In [41]:
# Show the model graphical representation
from IPython.display import SVG
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot


GRAPHFILE='model299.png'
plot_model(model_299, to_file=GRAPHFILE)
SVG(model_to_dot(model_299).create(prog='dot', format='svg'))
Out[41]:
G 140629477960392 input_7: InputLayer 140629189886024 model_5: Model 140629477960392->140629189886024 140629074744320 model_6: Model 140629477960392->140629074744320 140629060396480 concatenate_7: Concatenate 140629189886024->140629060396480 140629074744320->140629060396480 140629060396368 dropout: Dropout 140629060396480->140629060396368 140629060395640 dense1: Dense 140629060396368->140629060395640 140628937290080 batch_normalization_199: BatchNormalization 140629060395640->140628937290080 140628937085056 activation_191: Activation 140628937290080->140628937085056 140628936694976 drop1: Dropout 140628937085056->140628936694976 140628936693296 dense2: Dense 140628936694976->140628936693296 140628935711824 activation_192: Activation 140628936693296->140628935711824 140628935469656 drop2: Dropout 140628935711824->140628935469656 140628936110768 prob: Dense 140628935469656->140628936110768
In [86]:
GRAPHFILE='model331.png'
plot_model(model_331, to_file=GRAPHFILE)
SVG(model_to_dot(model_331).create(prog='dot', format='svg'))
Out[86]:
G 140630645095560 input_13: InputLayer 140630645094944 model_11: Model 140630645095560->140630645094944 140627598875280 model_12: Model 140630645095560->140627598875280 140627598578632 concatenate_15: Concatenate 140630645094944->140627598578632 140627598875280->140627598578632 140627319018104 dropout: Dropout 140627598578632->140627319018104 140630645094048 dense1: Dense 140627319018104->140630645094048 140627130007448 batch_normalization_502: BatchNormalization 140630645094048->140627130007448 140627129387664 activation_752: Activation 140627130007448->140627129387664 140627128993552 drop1: Dropout 140627129387664->140627128993552 140627128995232 prob: Dense 140627128993552->140627128995232

Model training

In [21]:
from keras.optimizers import SGD


# Model compiling
model_299.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='rmsprop')
model_331.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=SGD(lr=9e-3, momentum=.9, decay=1e-4, nesterov=True))
In [22]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

import gc


# Traininig epochs
EPOCHS = 10

# Files storing model best weights
WEIGHTFILE299 = 'model299.weights.best.hdf5'
WEIGHTFILE331 = 'model331.weights.best.hdf5'

# Model check point
ckp299 = ModelCheckpoint(
    filepath=WEIGHTFILE299,
    save_best_only=True,
    verbose=1
)

ckp331 = ModelCheckpoint(
    filepath=WEIGHTFILE331,
    save_best_only=True,
    verbose=1
)

# Early stopping
es = EarlyStopping(patience=3, verbose=1)
In [59]:
# Garbage collection
gc.collect()

# Model traininig with image input shape=(299, 299, 3)
history_299 = model_299.fit_generator(
    generator_trn_299, 
    steps_per_epoch=len(df_trn) // BATCH_SIZE_299,
    epochs=EPOCHS,
    callbacks=[ckp299, es],
    validation_data=generator_val_299,
    validation_steps=len(df_val) // BATCH_SIZE_299,
    verbose=1
)
Epoch 1/10
575/575 [==============================] - 700s 1s/step - loss: 0.1087 - acc: 0.9682 - val_loss: 0.0674 - val_acc: 0.9910

Epoch 00001: val_loss improved from inf to 0.06740, saving model to model299.0823.1218.weights.best.hdf5
Epoch 2/10
575/575 [==============================] - 673s 1s/step - loss: 0.0840 - acc: 0.9773 - val_loss: 0.0154 - val_acc: 0.9974

Epoch 00002: val_loss improved from 0.06740 to 0.01544, saving model to model299.0823.1218.weights.best.hdf5
Epoch 3/10
575/575 [==============================] - 674s 1s/step - loss: 0.0736 - acc: 0.9793 - val_loss: 0.0225 - val_acc: 0.9967

Epoch 00003: val_loss did not improve from 0.01544
Epoch 4/10
575/575 [==============================] - 674s 1s/step - loss: 0.0742 - acc: 0.9808 - val_loss: 0.0395 - val_acc: 0.9943

Epoch 00004: val_loss did not improve from 0.01544
Epoch 5/10
575/575 [==============================] - 674s 1s/step - loss: 0.0709 - acc: 0.9812 - val_loss: 0.0113 - val_acc: 0.9977

Epoch 00005: val_loss improved from 0.01544 to 0.01134, saving model to model299.0823.1218.weights.best.hdf5
Epoch 6/10
575/575 [==============================] - 674s 1s/step - loss: 0.0665 - acc: 0.9819 - val_loss: 0.0240 - val_acc: 0.9959

Epoch 00006: val_loss did not improve from 0.01134
Epoch 7/10
575/575 [==============================] - 674s 1s/step - loss: 0.0603 - acc: 0.9837 - val_loss: 0.0330 - val_acc: 0.9931

Epoch 00007: val_loss did not improve from 0.01134
Epoch 8/10
575/575 [==============================] - 674s 1s/step - loss: 0.0609 - acc: 0.9816 - val_loss: 0.0494 - val_acc: 0.9902

Epoch 00008: val_loss did not improve from 0.01134
Epoch 00008: early stopping
In [61]:
# Load the best weight gained from traninig
model_299.load_weights(WEIGHTFILE299)
In [24]:
# Garbage collection
gc.collect()

# Model traininig with image input shape=(331, 331, 3)
history_331 = model_331.fit_generator(
    generator_trn_331, 
    steps_per_epoch=len(df_trn) // BATCH_SIZE_331,
    epochs=EPOCHS,
    callbacks=[ckp331, es],
    validation_data=generator_val_331,
    validation_steps=len(df_val) // BATCH_SIZE_331,
    verbose=1
)
WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
1151/1151 [==============================] - 2624s 2s/step - loss: 0.0429 - acc: 0.9851 - val_loss: 0.0222 - val_acc: 0.9930

Epoch 00001: val_loss improved from inf to 0.02225, saving model to model331.weights.best.hdf5
Epoch 2/10
1151/1151 [==============================] - 2585s 2s/step - loss: 0.0409 - acc: 0.9848 - val_loss: 0.0191 - val_acc: 0.9936

Epoch 00002: val_loss improved from 0.02225 to 0.01910, saving model to model331.weights.best.hdf5
Epoch 3/10
1151/1151 [==============================] - 2582s 2s/step - loss: 0.0377 - acc: 0.9854 - val_loss: 0.0208 - val_acc: 0.9938

Epoch 00003: val_loss did not improve from 0.01910
Epoch 4/10
1151/1151 [==============================] - 2580s 2s/step - loss: 0.0359 - acc: 0.9862 - val_loss: 0.0227 - val_acc: 0.9925

Epoch 00004: val_loss did not improve from 0.01910
Epoch 5/10
1151/1151 [==============================] - 2581s 2s/step - loss: 0.0399 - acc: 0.9861 - val_loss: 0.0151 - val_acc: 0.9953

Epoch 00005: val_loss improved from 0.01910 to 0.01512, saving model to model331.weights.best.hdf5
Epoch 6/10
1151/1151 [==============================] - 2581s 2s/step - loss: 0.0339 - acc: 0.9873 - val_loss: 0.0173 - val_acc: 0.9951

Epoch 00006: val_loss did not improve from 0.01512
Epoch 7/10
1151/1151 [==============================] - 2581s 2s/step - loss: 0.0359 - acc: 0.9884 - val_loss: 0.0299 - val_acc: 0.9904

Epoch 00007: val_loss did not improve from 0.01512
Epoch 8/10
1151/1151 [==============================] - 2580s 2s/step - loss: 0.0319 - acc: 0.9883 - val_loss: 0.0226 - val_acc: 0.9930

Epoch 00008: val_loss did not improve from 0.01512
Epoch 00008: early stopping
In [25]:
# Load the best weight gained from traninig
model_331.load_weights(WEIGHTFILE331)

Training History Visualization

In [26]:
# Visualization of accuracy&loss for training step&validation step
def view_acc_and_loss(his):
    fig, axes = plt.subplots(1, 2, figsize=(18,5))

    for i, c in enumerate(['acc', 'loss']):
        axes[i].plot(his[c], label=f'Training {c}')
        axes[i].plot(his[f'val_{c}'], label=f'Validation {c}')
        axes[i].set_xlabel('epoch')
        axes[i].set_ylabel(c);
        axes[i].legend()
        axes[i].set_title(f'Training and Validation {c}')
        plt.grid()
    
    plt.show()
In [60]:
# Plotting loss and accuracy for model_299
view_acc_and_loss(history_299.history)
In [27]:
# Plotting loss and accuracy for model_331(SGD with lr=9e-3)
view_acc_and_loss(history_331.history)

Model Prediction

In [78]:
# Model_299 prediction
pred_299 = model_299.predict_generator(generator_tst_299, steps=np.ceil(len(df_tst) / BATCH_SIZE_299))
# Clip into range (0.005, 0.995), and reduce to 1 dimension
prob_299 = pred_299.clip(min=0.005, max=0.995).ravel()
In [28]:
# Model_331 prediction
pred_331 = model_331.predict_generator(generator_tst_331, steps=np.ceil(len(df_tst) / BATCH_SIZE_331))
# Clip into range (0.005, 0.995), and reduce to 1 dimension
prob_331 = pred_331.clip(min=0.005, max=0.995).ravel()
In [65]:
# Performance on testing set by model_299
show_images_from_df(df_tst, 6, 6, directory='test/', is_train=False, pred_label=prob_299)
In [29]:
# Performance on testing set by model_331
show_images_from_df(df_tst, 6, 6, directory=FOLDER_TEST, is_train=False, pred_label=prob_331)

Final Result

In [79]:
# Generate submission file for model_299
submission_df = pd.read_csv('sample_submission.csv')
for idx, fname in enumerate(fnames_tst):
    idx_df = int(fname.split('.')[0]) - 1
    submission_df.at[idx_df, 'label'] = prob_299[idx]

SUBMITFILE_299 = 'submission_299.csv'
submission_df.to_csv(SUBMITFILE_299, index=False)
submission_df.head()
Out[79]:
id label
0 1 0.995
1 2 0.995
2 3 0.995
3 4 0.995
4 5 0.005
In [30]:
# Generate submission file fof model_331
submission_df = pd.read_csv('sample_submission.csv')
for idx, fname in enumerate(fnames_tst):
    idx_df = int(fname.split('.')[0]) - 1
    submission_df.at[idx_df, 'label'] = prob_331[idx]

SUBMITFILE_331= 'submission_331.csv'
submission_df.to_csv(SUBMITFILE_331, index=False)
submission_df.head()
Out[30]:
id label
0 1 0.995
1 2 0.995
2 3 0.995
3 4 0.995
4 5 0.005
In [51]:
img_path = 'lijiahe.jpg'
img = image.load_img(img_path, target_size=SIZE_331)
arr = image.img_to_array(img)
tensor = np.expand_dims(arr, axis=0)
preds = model.predict(tensor)
preds = preds.ravel()[0]

plt.imshow(img)
if preds < 0.500:
    plt.title("This is a {}, Probability:{:.8f}%".format('cat', (1-preds) * 100), size=12)
else:
    plt.title("This is a {}, Probability:{:.8f}%".format('dog', (1-preds) * 100), size=12)